##read data

strawberry=read.csv("strawberry.csv",header = T)

##Data Overview

summary(strawberry)
##    Program               Year         Period          Week.Ending   
##  Length:4314        Min.   :2016   Length:4314        Mode:logical  
##  Class :character   1st Qu.:2016   Class :character   NA's:4314     
##  Mode  :character   Median :2018   Mode  :character                 
##                     Mean   :2018                                    
##                     3rd Qu.:2019                                    
##                     Max.   :2022                                    
##                                                                     
##   Geo.Level            State             State.ANSI    Ag.District   
##  Length:4314        Length:4314        Min.   : 1.00   Mode:logical  
##  Class :character   Class :character   1st Qu.: 6.00   NA's:4314     
##  Mode  :character   Mode  :character   Median :12.00                 
##                                        Mean   :16.46                 
##                                        3rd Qu.:21.00                 
##                                        Max.   :55.00                 
##                                        NA's   :86                    
##  Ag.District.Code  County        County.ANSI    Zip.Code        Region       
##  Mode:logical     Mode:logical   Mode:logical   Mode:logical   Mode:logical  
##  NA's:4314        NA's:4314      NA's:4314      NA's:4314      NA's:4314     
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##  watershed_code Watershed       Commodity          Data.Item        
##  Min.   :0      Mode:logical   Length:4314        Length:4314       
##  1st Qu.:0      NA's:4314      Class :character   Class :character  
##  Median :0                     Mode  :character   Mode  :character  
##  Mean   :0                                                          
##  3rd Qu.:0                                                          
##  Max.   :0                                                          
##                                                                     
##     Domain          Domain.Category       Value              CV....         
##  Length:4314        Length:4314        Length:4314        Length:4314       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
## 
head(strawberry)
##   Program Year Period Week.Ending Geo.Level  State State.ANSI Ag.District
## 1  CENSUS 2021   YEAR          NA     STATE ALASKA          2          NA
## 2  CENSUS 2021   YEAR          NA     STATE ALASKA          2          NA
## 3  CENSUS 2021   YEAR          NA     STATE ALASKA          2          NA
## 4  CENSUS 2021   YEAR          NA     STATE ALASKA          2          NA
## 5  CENSUS 2021   YEAR          NA     STATE ALASKA          2          NA
## 6  CENSUS 2021   YEAR          NA     STATE ALASKA          2          NA
##   Ag.District.Code County County.ANSI Zip.Code Region watershed_code Watershed
## 1               NA     NA          NA       NA     NA              0        NA
## 2               NA     NA          NA       NA     NA              0        NA
## 3               NA     NA          NA       NA     NA              0        NA
## 4               NA     NA          NA       NA     NA              0        NA
## 5               NA     NA          NA       NA     NA              0        NA
## 6               NA     NA          NA       NA     NA              0        NA
##      Commodity                                                   Data.Item
## 1 STRAWBERRIES               STRAWBERRIES, ORGANIC - OPERATIONS WITH SALES
## 2 STRAWBERRIES         STRAWBERRIES, ORGANIC - PRODUCTION, MEASURED IN CWT
## 3 STRAWBERRIES                STRAWBERRIES, ORGANIC - SALES, MEASURED IN $
## 4 STRAWBERRIES              STRAWBERRIES, ORGANIC - SALES, MEASURED IN CWT
## 5 STRAWBERRIES STRAWBERRIES, ORGANIC, FRESH MARKET - OPERATIONS WITH SALES
## 6 STRAWBERRIES  STRAWBERRIES, ORGANIC, FRESH MARKET - SALES, MEASURED IN $
##           Domain                      Domain.Category Value CV....
## 1 ORGANIC STATUS ORGANIC STATUS: (NOP USDA CERTIFIED)     2    (H)
## 2 ORGANIC STATUS ORGANIC STATUS: (NOP USDA CERTIFIED)   (D)    (D)
## 3 ORGANIC STATUS ORGANIC STATUS: (NOP USDA CERTIFIED)   (D)    (D)
## 4 ORGANIC STATUS ORGANIC STATUS: (NOP USDA CERTIFIED)   (D)    (D)
## 5 ORGANIC STATUS ORGANIC STATUS: (NOP USDA CERTIFIED)     2    (H)
## 6 ORGANIC STATUS ORGANIC STATUS: (NOP USDA CERTIFIED)   (D)    (D)

##Data preparing ###Remove columns with a single value in all columns (from giving qmd)

#define the function
drop_one_value_col <- function(df){
drop <- NULL
for(i in 1:dim(df)[2]){ #1:colume number
if((df |> distinct(df[,i]) |> count()) == 1){ #if only have one value, add i in drop
drop = c(drop, i)
} }

if(is.null(drop)){return("none")}else{

   print("Columns dropped:")
   print(colnames(df)[drop])
   strawberry <- df[, -1*drop]
}
}
#use function
strawberry_dropOneValue=drop_one_value_col(strawberry)
## [1] "Columns dropped:"
##  [1] "Week.Ending"      "Geo.Level"        "Ag.District"      "Ag.District.Code"
##  [5] "County"           "County.ANSI"      "Zip.Code"         "Region"          
##  [9] "watershed_code"   "Watershed"        "Commodity"
head(strawberry_dropOneValue)
##   Program Year Period  State State.ANSI
## 1  CENSUS 2021   YEAR ALASKA          2
## 2  CENSUS 2021   YEAR ALASKA          2
## 3  CENSUS 2021   YEAR ALASKA          2
## 4  CENSUS 2021   YEAR ALASKA          2
## 5  CENSUS 2021   YEAR ALASKA          2
## 6  CENSUS 2021   YEAR ALASKA          2
##                                                     Data.Item         Domain
## 1               STRAWBERRIES, ORGANIC - OPERATIONS WITH SALES ORGANIC STATUS
## 2         STRAWBERRIES, ORGANIC - PRODUCTION, MEASURED IN CWT ORGANIC STATUS
## 3                STRAWBERRIES, ORGANIC - SALES, MEASURED IN $ ORGANIC STATUS
## 4              STRAWBERRIES, ORGANIC - SALES, MEASURED IN CWT ORGANIC STATUS
## 5 STRAWBERRIES, ORGANIC, FRESH MARKET - OPERATIONS WITH SALES ORGANIC STATUS
## 6  STRAWBERRIES, ORGANIC, FRESH MARKET - SALES, MEASURED IN $ ORGANIC STATUS
##                        Domain.Category Value CV....
## 1 ORGANIC STATUS: (NOP USDA CERTIFIED)     2    (H)
## 2 ORGANIC STATUS: (NOP USDA CERTIFIED)   (D)    (D)
## 3 ORGANIC STATUS: (NOP USDA CERTIFIED)   (D)    (D)
## 4 ORGANIC STATUS: (NOP USDA CERTIFIED)   (D)    (D)
## 5 ORGANIC STATUS: (NOP USDA CERTIFIED)     2    (H)
## 6 ORGANIC STATUS: (NOP USDA CERTIFIED)   (D)    (D)

###Overview the value of each colume.

value_unique=lapply(strawberry_dropOneValue, function(x) head(unique(x), 5))
value_unique
## $Program
## [1] "CENSUS" "SURVEY"
## 
## $Year
## [1] 2021 2019 2016 2022 2020
## 
## $Period
## [1] "YEAR"                "MARKETING YEAR"      "YEAR - AUG FORECAST"
## 
## $State
## [1] "ALASKA"      "CALIFORNIA"  "CONNECTICUT" "FLORIDA"     "GEORGIA"    
## 
## $State.ANSI
## [1]  2  6  9 12 13
## 
## $Data.Item
## [1] "STRAWBERRIES, ORGANIC - OPERATIONS WITH SALES"              
## [2] "STRAWBERRIES, ORGANIC - PRODUCTION, MEASURED IN CWT"        
## [3] "STRAWBERRIES, ORGANIC - SALES, MEASURED IN $"               
## [4] "STRAWBERRIES, ORGANIC - SALES, MEASURED IN CWT"             
## [5] "STRAWBERRIES, ORGANIC, FRESH MARKET - OPERATIONS WITH SALES"
## 
## $Domain
## [1] "ORGANIC STATUS"        "TOTAL"                 "CHEMICAL, FUNGICIDE"  
## [4] "CHEMICAL, HERBICIDE"   "CHEMICAL, INSECTICIDE"
## 
## $Domain.Category
## [1] "ORGANIC STATUS: (NOP USDA CERTIFIED)"                              
## [2] "NOT SPECIFIED"                                                     
## [3] "CHEMICAL, FUNGICIDE: (AZOXYSTROBIN = 128810)"                      
## [4] "CHEMICAL, FUNGICIDE: (BACILLUS AMYLOLIQUEFAC F727 = 16489)"        
## [5] "CHEMICAL, FUNGICIDE: (BACILLUS AMYLOLIQUEFACIENS MBI 600 = 129082)"
## 
## $Value
## [1] "2"           " (D)"        "142"         "1,413,251"   "311,784,980"
## 
## $CV....
## [1] "(H)"  "(D)"  "19.2" "51.6" "46.0"

###Data processing of Value and CV…

#the value (D) means: Withheld to avoid disclosing data for individual operations. 
#the value (H) means: Coefficient of variation or generalized coefficient of variation is greater than or equal to 99.95 percent or the standard error is greater than or equal to 99.95 percent of the mean
straw_na <- strawberry_dropOneValue |> filter(CV....=="(H)"|CV....=="(D)"|Value=="(D)")
vals=strawberry_dropOneValue$Value
vals=sub(",","",vals)
vals=sub('""',"",vals)
vals=as.numeric(vals)
strawberry_dropOneValue["Value"]=vals
vals=strawberry_dropOneValue$CV....
vals=as.numeric(vals)
strawberry_dropOneValue["CV...."]=vals

###Classified by program

stb_census <- strawberry_dropOneValue |> filter(Program=="CENSUS")

## ## filter rows of California data from the SURVEY data
stb_survey <- strawberry_dropOneValue |> filter(Program=="SURVEY")

census_col <- colnames(stb_census)

survey_col <- colnames(stb_survey)
stb_census %>%
  group_by(State) %>%
  summarise(Total_Value = sum(Value, na.rm = TRUE))
## # A tibble: 46 × 2
##    State       Total_Value
##    <chr>             <dbl>
##  1 ALABAMA               6
##  2 ALASKA                4
##  3 ARIZONA               6
##  4 ARKANSAS              2
##  5 CALIFORNIA       444002
##  6 COLORADO          62236
##  7 CONNECTICUT      254148
##  8 FLORIDA          410406
##  9 GEORGIA           28065
## 10 IDAHO            205128
## # ℹ 36 more rows
stb_survey %>%
  group_by(State) %>%
  summarise(Total_Value = sum(Value, na.rm = TRUE))
## # A tibble: 11 × 2
##    State          Total_Value
##    <chr>                <dbl>
##  1 CALIFORNIA       11639437.
##  2 FLORIDA           3859748.
##  3 MICHIGAN                0 
##  4 NEW YORK           422903 
##  5 NORTH CAROLINA    2290141.
##  6 OHIO                    0 
##  7 OREGON            2084918.
##  8 OTHER STATES       591108.
##  9 PENNSYLVANIA            0 
## 10 WASHINGTON        1154029.
## 11 WISCONSIN               0
year_census <- stb_census %>%
  group_by(Year) %>%
  summarise(Sum_Value = sum(Value, na.rm = TRUE))
year_survey <- stb_survey %>%
  group_by(Year) %>%
  summarise(Sum_Value = sum(Value, na.rm = TRUE))
ggplot(year_survey) +
  aes(x = Year, y = Sum_Value) +
  geom_point(shape = "circle", size = 2.5, colour = "#112446") +
  theme_minimal()

Extract market names and chemical substances and their codes

stb_census <- stb_census %>%
  mutate(Data.Item = ifelse(
    str_detect(Data.Item, "MEASURED IN"),
    str_extract(Data.Item, "(?<=MEASURED IN ).*"),
    ifelse(str_detect(Data.Item, "SALES"), "SALES", Data.Item)
  ))
stb_survey <- stb_survey %>%
  mutate(
    Chemical = if_else(str_detect(Domain.Category, "\\(.*=.*\\)"),
                       str_extract(Domain.Category, "(?<=\\().*?(?=\\=)"),
                       NA_character_),
    Chemical_Code = if_else(str_detect(Domain.Category, "\\(.*=.*\\)"),
                            str_extract(Domain.Category, "(?<=\\=).*?(?=\\))"),
                            NA_character_)
  )
stb_census=subset(stb_census, !is.na(Value))
stb_survey=subset(stb_survey, !is.na(Value))
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(tools)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# average_values <- stb_census %>%
#   group_by(State) %>%
#   summarise(Average_Value = mean(Value, na.rm = TRUE))
us_states <- st_read("https://eric.clst.org/assets/wiki/uploads/Stuff/gz_2010_us_040_00_5m.json")
## Reading layer `gz_2010_us_040_00_5m' from data source 
##   `https://eric.clst.org/assets/wiki/uploads/Stuff/gz_2010_us_040_00_5m.json' 
##   using driver `GeoJSON'
## Simple feature collection with 52 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -179.1473 ymin: 17.92688 xmax: 179.7785 ymax: 71.35256
## Geodetic CRS:  WGS 84
capitalize_first <- function(string) {
  paste0(toupper(substr(string, 1, 1)), tolower(substr(string, 2, nchar(string))))
}

# df <- data.frame(State = sapply(average_values$State, capitalize_first), 
#                  Value = average_values$Value)
stb_census_money=stb_census|>
  filter(Data.Item=="$")
values <- stb_census_money %>%
  group_by(State,Year) %>%
  summarise(Value = mean(Value, na.rm = TRUE))
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
values$State<-sapply(values$State, capitalize_first)
merged_data <- left_join(us_states, values, by = c("NAME" = "State"))

p <- ggplot(data = merged_data) +
  geom_sf(aes(fill = Value, frame = Year)) +
  scale_fill_gradient(low = "lightblue", high = "darkred") +
  theme_minimal() +
  labs(title = "Value by State", fill = "Value") +
  coord_sf(xlim = c(-170, -65), ylim = c(25, 72))
## Warning in layer_sf(geom = GeomSf, data = data, mapping = mapping, stat = stat,
## : Ignoring unknown aesthetics: frame
plotly_map <- ggplotly(p)
plotly_map
stb_census_sales=stb_census|>
  filter(Data.Item=="SALES")
values <- stb_census_sales %>%
  group_by(State,Year) %>%
  summarise(Value = mean(Value, na.rm = TRUE))
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
values$State<-sapply(values$State, capitalize_first)
merged_data <- left_join(us_states, values, by = c("NAME" = "State"))

p <- ggplot(data = merged_data) +
  geom_sf(aes(fill = Value, frame = Year)) +
  scale_fill_gradient(low = "lightblue", high = "darkred") +
  theme_minimal() +
  labs(title = "Value by State", fill = "Value") +
  coord_sf(xlim = c(-170, -65), ylim = c(25, 72))
## Warning in layer_sf(geom = GeomSf, data = data, mapping = mapping, stat = stat,
## : Ignoring unknown aesthetics: frame
plotly_map <- ggplotly(p)
plotly_map
stb_census_cwt=stb_census|>
  filter(Data.Item=="CWT")
values <- stb_census_cwt %>%
  group_by(State,Year) %>%
  summarise(Value = mean(Value, na.rm = TRUE))
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
values$State<-sapply(values$State, capitalize_first)
merged_data <- left_join(us_states, values, by = c("NAME" = "State"))

p <- ggplot(data = merged_data) +
  geom_sf(aes(fill = Value, frame = Year)) +
  scale_fill_gradient(low = "lightblue", high = "darkred") +
  theme_minimal() +
  labs(title = "Value by State", fill = "Value") +
  coord_sf(xlim = c(-170, -65), ylim = c(25, 72))
## Warning in layer_sf(geom = GeomSf, data = data, mapping = mapping, stat = stat,
## : Ignoring unknown aesthetics: frame
plotly_map <- ggplotly(p)
plotly_map
# stb_survey$Chemical_Code_num <- as.numeric(stb_survey$Chemical_Code)
# stb_survey$Chemical_Code_str <- ifelse(is.na(stb_survey$Chemical_Code_num),
#                                         NA,
#                                         sprintf("%06d", stb_survey$Chemical_Code_num))
# library(httr)
# library(jsonlite)
# get_cas <- function(PC){
#     path <- paste0("https://ordspub.epa.gov/ords/pesticides/apprilapi/?q=%7b%22ais%22:%7b%22$instr%22:%22", PC,"%22%7d%7d")
#     r <- GET(url = path)
#     r_text <- content(r, as = "text", encoding = "UTF-8")
#     df <- fromJSON(r_text, flatten = TRUE)
#     df_strwb <- df$items[grepl("Strawberries", df$items$sites, fixed=T),]
#     ais <- df_strwb$ais[1]
#     pattern <- "\\(([^A-Za-z]+)\\/([0-9-]+)\\)"
#     text <- ais
#     matches <- regmatches(text, gregexpr(pattern, text))
#     cas <- sapply(matches, function(x) gsub(".*\\/([0-9-]+)\\)", "\\1", x)) 
#     if (is.character(cas)) {
#         return(cas[1])
# } 
#     else {
#         return("can't find")
# }
# }
# unique_stb=unique(stb_survey$Chemical_Code_str)
# result=numeric()
# k=numeric()
# for(i in 1:length(unique_stb)){
#   result[i]=get_cas(unique_stb[i])
#   k[i]=unique_stb[i]
#   print(result[i])
# }
# data_save=data.frame(k,result)
# write.csv(data_save,"/Users/bingtianye/Desktop/data_save.csv",row.names = F)
data_save=read.csv("/Users/bingtianye/Desktop/bu_study/MA615 Data Science in R/exercise/data_save",header=T)
data_save$Chemical_Code_num <- as.numeric(data_save$k)
data_save$Chemical_Code_str <- ifelse(is.na(data_save$Chemical_Code_num),
                                         NA,
                                         sprintf("%06d", data_save$Chemical_Code_num))
po=read.csv("/Users/bingtianye/Desktop/bu_study/MA615 Data Science in R/exercise/CAS.csv",header=T)
merged_data <- merge(data_save, po, by.x="result", by.y="chemical", all.x=TRUE)